1 Load the libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.0
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

2 Basic Graph Labels

ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
    geom_point(aes(color=Species, shape=Species)) +
    labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width", color = "Plant Species", shape = "Plant Species")

3 Themes

ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
    geom_point(aes(color=Species, shape=Species)) +
    labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width", color = "Plant Species", shape = "Plant Species") +
  theme_classic()

4 Colors

iris_example_plot1 <- ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
    geom_point(color = "red", aes(shape = Species))+
    labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width") 
ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
    geom_point(aes(color = Species, shape = Species)) +
    scale_color_manual(values=c("blue", "purple", "red")) +
    labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width") 

iris_example_plot2 <-ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
    geom_point(aes(color = Species, shape = Species)) +
    scale_color_brewer(palette="Dark2") +
    labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width") 
library(viridisLite)
  ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
    geom_point(aes(fill = Species), color = "black", pch=21) +
    labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width") 

library(viridisLite)
  ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
    geom_point(aes(color = Species, shape = Species)) +
    scale_colour_viridis_d() +
    labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width") 

5 Graphic Output

You can export as a pdf, svg, tiff, png, bmp, jpeg, and eps

# Plot graph to a pdf outputfile
#pdf("images/iris_example_plot1.pdf", width=6, height=3)
ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) + 
  geom_point() +
  labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width") 

dev.off()
## null device 
##           1
# Plot graph to a png outputfile
ppi <- 300
#png("images/iris_example_plot2.png", width=6*ppi, height=4*ppi, res=ppi)
ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) + 
  geom_point()
dev.off()
## null device 
##           1

6 Interactive graphs

library(plotly)

# Version 1
ggplotly(
  ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) + 
    geom_point()
 )
# Version 2
p <- ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) + 
  geom_point()
ggplotly(p)

7 NEON examples

7.1 Load tables into R

NEON_MAGs <- read_csv("data/GOLD_NEON.csv") %>% 
  # remove columns that are not needed for data analysis
  select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`)) %>% 
  # create a new column with the Assembly Type
  mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
                            TRUE ~ "Individual")) %>% 
  mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>% 
  separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus"), "; ", remove = FALSE) %>% 
  # Get rid of the the common string "Soil microbial communities from "
  mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>% 
  # Use the first `-` to split the column in two
  separate(`Genome Name`, c("Site","Sample Name"), " - ") %>% 
  # Get rid of the the common string "S-comp-1"
  mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
  # separate the Sample Name into Site ID and plot info
  separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>% 
  # separate the plot info into 3 columns
  separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-") 
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl  (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date  (1): Date Added
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 6 pieces. Additional pieces discarded in 29 rows [12, 32, 66, 79, 80,
## 88, 96, 102, 104, 240, 334, 386, 657, 790, 846, 931, 943, 983, 1041, 1095,
## ...].
## Warning: Expected 6 pieces. Missing pieces filled with `NA` in 429 rows [6, 7, 42, 49,
## 50, 55, 60, 83, 85, 97, 100, 105, 107, 113, 114, 116, 119, 125, 129, 130, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [1131, 1132,
## 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145,
## 1146, 1147, 1148, 1149, 1150, ...].

Remove Archaea

NEON_MAGs_bact_ind <- NEON_MAGs %>% 
  filter(Domain == "Bacteria") %>% 
  filter(`Assembly Type` == "Individual") 

8 Bar plots

NEON_MAGs_bact_ind %>% 
ggplot(aes(x = Phylum)) +
  geom_bar() +
  coord_flip()

forcats puts counts in descending order

NEON_MAGs_bact_ind %>% 
ggplot(aes(x = fct_infreq(Phylum))) +
  geom_bar() +
  coord_flip()

or you can pipe it

NEON_MAGs_bact_ind %>% 
  count(Phylum) %>% 
ggplot(aes(x = Phylum, y = n)) +
  geom_col(stat = "identity") +
  coord_flip()
## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`

in descending order:

NEON_MAGs_bact_ind %>% 
  count(Phylum) %>% 
ggplot(aes(x = reorder(Phylum, n), y = n)) +
  geom_col(stat = "identity") +
  coord_flip()
## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`

8.1 Stacked vs multiple bar plots

NEON_MAGs_bact_ind %>% 
ggplot(aes(x = fct_rev(fct_infreq(Phylum)), fill = Site)) +
  geom_bar() +
  theme(legend.position = "bottom") +
  theme(legend.justification = "left") +
  theme(legend.key.size = unit( 0.1, 'cm')) +
  theme(legend.key.height = unit(0.1, 'cm')) +
  theme(legend.key.width = unit(0.1, 'cm')) +
  theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
  theme(legend.text = element_text(colour = "black", size = 4)) +
  theme(legend.box.background = element_rect()) +
  theme(legend.box.margin = margin(4, 4, 4, 4)) +
  theme(legend.box.just = "left") +
  theme( axis.text.y = element_blank()) +
  scale_y_continuous(n.breaks = 10) +
  xlab("New Species") +
  ylab("Count") +
  labs(title = "Quality of New Species Data") +
  coord_flip()

NEON_MAGs_bact_ind %>% 
ggplot(aes(x = fct_rev(fct_infreq(Phylum)), fill = Site)) +
  geom_bar(position = "dodge") +
  theme(legend.position = "bottom") +
  theme(legend.justification = "left") +
  theme(legend.key.size = unit( 0.1, 'cm')) +
  theme(legend.key.height = unit(0.1, 'cm')) +
  theme(legend.key.width = unit(0.1, 'cm')) +
  theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
  theme(legend.text = element_text(colour = "black", size = 4)) +
  theme(legend.box.background = element_rect()) +
  theme(legend.box.margin = margin(4, 4, 4, 4)) +
  theme(legend.box.just = "left") +
  theme( axis.text.y = element_blank()) +
  scale_y_continuous(n.breaks = 10) +
  xlab("New Species") +
  ylab("Count") +
  labs(title = "Quality of New Species Data") +
  coord_flip()

Bar width can be adjusted

NEON_MAGs_bact_ind %>% 
ggplot(aes(x = fct_rev(fct_infreq(Phylum)), fill = Site)) +
  geom_bar(position = position_dodge2(width = 0.9, preserve = "single")) +
  coord_flip()

8.1.1 Multiple panels (facet wrap)

NEON_MAGs_bact_ind %>% 
ggplot(aes(x = Phylum)) +
  geom_bar(position = position_dodge2(width = 0.9, preserve = "single")) +
  coord_flip() +
  facet_wrap(vars(Site), scales = "free", ncol = 2)

9 Histograms

NEON_MAGs_bact_ind %>% 
ggplot(aes(x = `Total Number of Bases`)) +
  geom_histogram(bins = 50) 

10 Box Plots

NEON_MAGs_bact_ind %>%   
ggplot(aes(x = fct_infreq(Phylum), y = `Total Number of Bases`)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))

10.1 Showing each point in the plot

NEON_MAGs_bact_ind %>%   
ggplot(aes(x = fct_infreq(Phylum), y = `Total Number of Bases`)) +
  geom_point() +
  coord_flip()

11 Exercises

For all exercises make complete graphs that are report ready. Relabel the x-axis, y-axis and legend for clarity, add a title, add color and size appropriately. The NAs in the taxonomy indicate a novel species starting with the highest level. For example a NA in a class that has an assigned phylum Proteobacteria would be a novel class in the phylum Proteobacteria.

11.1 Exercise 1

What are the overall class MAG counts?

The counts are displayed in the following graph:

NEON_MAGs_bact_ind %>% 
  count(Class) %>% 
ggplot(aes(x = reorder(Class, n), y = n)) +
  geom_bar(stat = "Identity", fill = "red") +
  xlab("Class") +
  ylab("Count") +
  labs(title = "MAG Counts by Class") +
  theme( axis.text.y = element_text(size = 4)) +
  coord_flip()

11.2 Exercise 2

What are the MAG counts for each subplot? Color by site ID.

NEON_MAGs_bact_ind %>% 
ggplot(aes(x = fct_rev(fct_infreq(Class)), fill = Site)) +
  geom_bar(position = position_dodge2(width = 0.5, preserve = "single")) +
 theme(legend.position = "bottom") +
  theme(legend.justification = "left") +
  theme(legend.key.size = unit( 0.1, 'cm')) +
  theme(legend.key.height = unit(0.1, 'cm')) +
  theme(legend.key.width = unit(0.1, 'cm')) +
  theme(legend.title = element_text(colour = "black", size = 2, face = "bold")) +
  theme(legend.text = element_text(colour = "black", size = 2)) +
  theme(legend.box.background = element_rect()) +
  theme(legend.box.margin = margin(4, 4, 4, 4)) +
  theme(legend.box.just = "left") +
  theme( axis.text.x = element_text(size = 4, angle = 90)) +
  xlab("Class") +
  ylab("Count") +
  labs(title = "MAG Counts by Class and Site")

11.3 Exercise 3

How many novel bacteria were discovered? Show the number of NAs for each site

NEON_MAGs_bact_ind %>%
  filter(is.na(Genus)) %>% 
ggplot(aes(x = fct_rev(fct_infreq(Genus)), fill = Site)) +
  geom_bar(position = position_dodge2(width = 0.5, preserve = "single")) +
 theme(legend.position = "bottom") +
  theme(legend.justification = "left") +
  theme(legend.key.size = unit( 0.1, 'cm')) +
  theme(legend.key.height = unit(0.1, 'cm')) +
  theme(legend.key.width = unit(0.1, 'cm')) +
  theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
  theme(legend.text = element_text(colour = "black", size = 2.2)) +
  theme(legend.box.background = element_rect()) +
  theme(legend.box.margin = margin(4, 4, 4, 4)) +
  theme(legend.box.just = "left") +
  theme( axis.text.y = element_blank()) +
  xlab("New Species") +
  ylab("Count") +
  labs(title = "New Species at a Given Site") +
  coord_flip()

11.4 Exercise 4

How many novel bacterial MAGs are high vs medium quality?

NEON_MAGs_bact_ind %>%
  filter(is.na(Genus)) %>% 
ggplot(aes(x = fct_rev(fct_infreq(Genus)), fill = `Bin Quality`)) +
  geom_bar(position = position_dodge2(width = 0.9, preserve = "single")) +
 theme(legend.position = "bottom") +
  theme(legend.justification = "left") +
  theme(legend.key.size = unit( 0.1, 'cm')) +
  theme(legend.key.height = unit(0.1, 'cm')) +
  theme(legend.key.width = unit(0.1, 'cm')) +
  theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
  theme(legend.text = element_text(colour = "black", size = 4)) +
  theme(legend.box.background = element_rect()) +
  theme(legend.box.margin = margin(4, 4, 4, 4)) +
  theme(legend.box.just = "left") +
  theme( axis.text.y = element_blank()) +
  scale_y_continuous(n.breaks = 10) +
  xlab("New Species") +
  ylab("Count") +
  labs(title = "Quality of New Species Data") +
  coord_flip()

11.5 Exercise 5

What phyla have novel bacterial genera?

NEON_MAGs_bact_ind %>%
  filter(is.na(Genus)) %>% 
ggplot(aes(x = fct_rev(fct_infreq(Genus)), fill = `Phylum`)) +
  geom_bar(position = position_dodge2(width = 0.9, preserve = "single")) +
 theme(legend.position = "bottom") +
  theme(legend.justification = "left") +
  theme(legend.key.size = unit( 0.1, 'cm')) +
  theme(legend.key.height = unit(0.1, 'cm')) +
  theme(legend.key.width = unit(0.1, 'cm')) +
  theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
  theme(legend.text = element_text(colour = "black", size = 4)) +
  theme(legend.box.background = element_rect()) +
  theme(legend.box.margin = margin(4, 4, 4, 4)) +
  theme(legend.box.just = "left") +
  theme( axis.text.y = element_blank()) +
  scale_y_continuous(n.breaks = 20) +
  xlab("New Genera") +
  ylab("Count") +
  labs(title = "Number of Novel Genera per Phylum") +
  coord_flip()

11.6 Exercise 6

Make a stacked bar plot of the total number of MAGs at each site using phylum as the fill.

NEON_MAGs_bact_ind %>% 
ggplot(aes(x = fct_rev(fct_infreq(Site)), fill = Phylum)) +
  geom_bar() +
  theme(legend.position = "bottom") +
  theme(legend.justification = "left") +
  theme(legend.key.size = unit( 0.1, 'cm')) +
  theme(legend.key.height = unit(0.1, 'cm')) +
  theme(legend.key.width = unit(0.1, 'cm')) +
  theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
  theme(legend.text = element_text(colour = "black", size = 4)) +
  theme(legend.box.background = element_rect()) +
  theme(legend.box.margin = margin(4, 4, 4, 4)) +
  theme(legend.box.just = "left") +
  theme( axis.text.y = element_text(size = 4)) +
  scale_y_continuous(n.breaks = 12) +
  xlab("Site") +
  ylab("Count") +
  labs(title = "Number of MAGs per Site by Plyla") +
  coord_flip()

11.7 Exercise 7

Using facet_wrap make plots of the total number of MAGS at each site for each phylum.

NEON_MAGs_bact_ind %>% 
ggplot(aes(x = Site)) +
  geom_bar(position = position_dodge2(width = 3, preserve = "single")) +
  theme(axis.text.y = element_text(size = 2)) +
  theme(axis.text.x = element_text(size = 4)) +
  theme(axis.title.x = element_text(size = 6)) +
  theme(axis.title.y = element_text(size = 6)) +
  theme(axis.title.y.left = element_text(size = 2)) +
  theme(strip.background = element_blank(),
        strip.text = element_text(size = rel(0.3), margin = margin()),
        panel.spacing = unit(3, "pt")) +
  coord_flip() +
  scale_y_continuous(n.breaks = 12) +
  facet_wrap(vars(Phylum), scales = "fixed") +
   xlab("Site") +
  ylab("Total MAGs") +
  labs(title = "Total number of MAGs per Site") 

11.8 Exercise 8

What is the relationship between MAGs genome size and the number of genes? Color by phylum.

  ggplot(data = NEON_MAGs_bact_ind, aes(x = `Total Number of Bases`, y = `Gene Count`, color = Phylum)) + 
    geom_point() +
  theme(legend.position = "bottom") +
  theme(legend.justification = "left") +
  theme(legend.key.size = unit( 0.1, 'cm')) +
  theme(legend.key.height = unit(0.1, 'cm')) +
  theme(legend.key.width = unit(0.1, 'cm')) +
  theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
  theme(legend.text = element_text(colour = "black", size = 4)) +
  theme(legend.box.background = element_rect()) +
  theme(legend.box.margin = margin(4, 4, 4, 4)) +
  theme(legend.box.just = "left") +
  theme( axis.text.y = element_text(size = 4)) +
  scale_y_continuous(n.breaks = 12) +
  xlab("Total Bases") +
  ylab("Gene Count") +
  labs(title = "Genome Size and Gene Number by Phylum") 

There is a strong positive correlation.

11.9 Exercise 9

What is the relationship between scaffold count and MAG completeness?

ggplot(data = NEON_MAGs_bact_ind, aes(x = `Scaffold Count`, y = `Bin Completeness`,)) + 
    geom_point(color = "blue") +
  theme( axis.text.y = element_text(size = 10)) +
  scale_y_continuous(n.breaks = 12) +
  xlab("Scaffold Count") +
  ylab("Percent Complete") +
  labs(title = "Scaffold Count and Bin Completeness") 

There seems to be a slight negative correlation.